Predict house sale prices and practice feature engineering, random forests (RFs), and gradient boosting

# Project root; the data path below is built relative to it.
PROJ_PATH <- '~/Documents/kaggle/house_prices'

# load() restores every object stored in the .Rdata file into the current
# environment (presumably `train` and the index helpers used further down --
# TODO confirm against the script that wrote the file).
# The workspace is intentionally NOT wiped with rm(list=ls()) beforehand: that
# call deletes the caller's objects without resetting loaded packages or
# options, so it does not give a clean session. Restart R for a clean run.
load(file.path(PROJ_PATH, 'data/house_prices.Rdata'))

# Plotting functions
# Histogram with the house look: 30 breaks unless told otherwise, and one
# gray used for both the bar fill and the bar outline.
#   ...     the data plus any further arguments, handed to hist() untouched
#   breaks  forwarded to hist() as `breaks`
#   col     colour applied to the bars and to their borders
#   xlab    x-axis label; the NULL default is forwarded unchanged
# Returns whatever hist() returns.
hist2 <- function(..., breaks = 30, col = 'darkgray', xlab = NULL) {
  hist(
    ...,
    breaks = breaks,
    col = col,
    border = col,
    xlab = xlab
  )
}

# Horizontal bar plot with the house look: one gray used for both the bar fill
# and the bar outline.
#   ...   the bar heights plus any further arguments, handed to barplot()
#   col   colour applied to the bars and to their borders
#   xlab  x-axis label; now forwarded to barplot() (it used to be accepted
#         but silently dropped). The NULL default matches barplot()'s own.
# Returns whatever barplot() returns.
barplot2 <- function(..., col='darkgray', xlab=NULL){
  barplot(..., col=col, border=col, horiz=TRUE, xlab=xlab)
}

# Generic plot with the house look: translucent dark-gray marks (so that
# overlapping points stay readable) and no box drawn around the plot region.
#   ...  the data plus any further arguments, handed to plot() untouched
#   col  plotting colour; defaults to 'gray30' at alpha 0.2
#   bty  box type forwarded to plot(); 'n' by default
# Returns whatever plot() returns.
plot2 <- function(..., col = adjustcolor('gray30', alpha.f = 0.2), bty = 'n') {
  plot(
    ...,
    col = col,
    bty = bty
  )
}

Data description

# Size of the training data as (rows, columns)
dim(train)
## [1] 1460   81
# Column names of the training data, in column order
names(train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"

Histograms of numeric variables

# Uniform 3-line margins, four panels per row
par(mar = c(3, 3, 3, 3), mfrow = c(1, 4))

# One histogram per column index in num_idx, titled with the matching nm entry
for (ni in num_idx) {
  hist2(train[[ni]], main = nm[ni])
}

# The sale price gets a histogram of its own
hist2(train[['SalePrice']], main = 'SalePrice')

Frequencies of categorical variables

# Axis labels perpendicular to the axis, a slightly wider left margin for the
# category labels, four panels per row
par(las = 2, mar = c(3, 4, 3, 3), mfrow = c(1, 4))

# One frequency bar plot per column index in cat_idx, titled with the
# matching nm entry
for (ci in cat_idx) {
  barplot2(table(train[[ci]]), main = nm[ci])
}

Missingness of variables (only those with any values missing)

# Fraction (0-1) of missing values in each column of train, sorted ascending.
# vapply() replaces sapply() so the result is always a named numeric vector,
# regardless of the shape of the input.
missing_perc <- sort(vapply(train,
                            function(x) sum(is.na(x)) / length(x),
                            numeric(1)))

# Perpendicular axis labels and a wide left margin for the variable names
par(las=2, mar=c(3,7,2,2))

# Only variables with at least one missing value are drawn; the value axis is
# fixed to [0, 1] so bar lengths read directly as fractions.
barplot2(missing_perc[missing_perc>0],
         cex.names=0.6, cex.axis=0.6, xlim=c(0,1),
         main='Missingness')

Relation of each variable to price

# Uniform 3-line margins, four panels per row
par(mar = c(3, 3, 3, 3), mfrow = c(1, 4))

# For each column index in num_idx, plot the two-column data frame
# (variable, SalePrice); every panel shares the full SalePrice range on y.
for (ni in num_idx) {
  plot2(
    train[, c(nm[ni], 'SalePrice')],
    main = nm[ni],
    ylim = range(train$SalePrice)
  )
}

# Uniform 3-line margins, four panels per row
par(mar = c(3, 3, 3, 3), mfrow = c(1, 4))

# For each column index in cat_idx, plot SalePrice against the column
# converted to a factor; every panel shares the full SalePrice range on y.
for (ci in cat_idx) {
  plot2(
    factor(train[[ci]]),
    train$SalePrice,
    main = nm[ci],
    ylim = range(train$SalePrice),
    frame = FALSE
  )
}

Top correlated numeric variables (pairwise-complete observations)

# Pairwise correlations between the columns selected by num_idx. Each
# coefficient uses only the rows where both variables are present.
# drop=FALSE keeps a data frame even if num_idx selects a single column.
cor_mat <- cor(train[, num_idx, drop=FALSE], use='pairwise.complete.obs')

# For every pair of variables, the fraction of rows in which both are
# non-missing: the cross-product of the "is present" indicator matrix counts
# the jointly observed rows, and dividing by nrow(train) gives the fraction.
not_na_mat <- !is.na(as.matrix(train[, num_idx, drop=FALSE]))
pairwise_completeness <- crossprod(not_na_mat) / nrow(train)

# Long format: one row per ordered pair (X1, X2). as.vector() unrolls both
# matrices column by column, which matches the rep(each=)/rep(times=) layout.
cor_df <- data.frame(X1=rep(colnames(cor_mat), each=ncol(cor_mat)),
                     X2=rep(colnames(cor_mat), times=ncol(cor_mat)),
                     cor=as.vector(cor_mat),
                     perc_complete=as.vector(pairwise_completeness))

# Drop the diagonal (a variable paired with itself), then order by strength.
# Each unordered pair still appears twice, once per ordering.
cor_df <- cor_df[cor_df$X1 != cor_df$X2,]
cor_df <- cor_df[order(abs(cor_df$cor), decreasing=TRUE),]

# which() discards undefined (NA) correlations; a bare logical index would
# turn each NA into an all-NA row in the table.
knitr::kable(cor_df[which(abs(cor_df$cor) > 0.50),], row.names=FALSE)
X1 X2 cor perc_complete
GarageCars GarageArea 0.8824754 1.0000000
GarageArea GarageCars 0.8824754 1.0000000
YearBuilt GarageYrBlt 0.8256675 0.9445205
GarageYrBlt YearBuilt 0.8256675 0.9445205
GrLivArea TotRmsAbvGrd 0.8254894 1.0000000
TotRmsAbvGrd GrLivArea 0.8254894 1.0000000
TotalBsmtSF X1stFlrSF 0.8195300 1.0000000
X1stFlrSF TotalBsmtSF 0.8195300 1.0000000
X2ndFlrSF GrLivArea 0.6875011 1.0000000
GrLivArea X2ndFlrSF 0.6875011 1.0000000
BedroomAbvGr TotRmsAbvGrd 0.6766199 1.0000000
TotRmsAbvGrd BedroomAbvGr 0.6766199 1.0000000
BsmtFinSF1 BsmtFullBath 0.6492118 1.0000000
BsmtFullBath BsmtFinSF1 0.6492118 1.0000000
YearRemodAdd GarageYrBlt 0.6422768 0.9445205
GarageYrBlt YearRemodAdd 0.6422768 0.9445205
GrLivArea FullBath 0.6300116 1.0000000
FullBath GrLivArea 0.6300116 1.0000000
X2ndFlrSF TotRmsAbvGrd 0.6164226 1.0000000
TotRmsAbvGrd X2ndFlrSF 0.6164226 1.0000000
X2ndFlrSF HalfBath 0.6097073 1.0000000
HalfBath X2ndFlrSF 0.6097073 1.0000000
OverallQual GarageCars 0.6006707 1.0000000
GarageCars OverallQual 0.6006707 1.0000000
OverallQual GrLivArea 0.5930074 1.0000000
GrLivArea OverallQual 0.5930074 1.0000000
YearBuilt YearRemodAdd 0.5928550 1.0000000
YearRemodAdd YearBuilt 0.5928550 1.0000000
GarageYrBlt GarageCars 0.5889200 0.9445205
GarageCars GarageYrBlt 0.5889200 0.9445205
OverallQual YearBuilt 0.5723228 1.0000000
YearBuilt OverallQual 0.5723228 1.0000000
X1stFlrSF GrLivArea 0.5660240 1.0000000
GrLivArea X1stFlrSF 0.5660240 1.0000000
GarageYrBlt GarageArea 0.5645671 0.9445205
GarageArea GarageYrBlt 0.5645671 0.9445205
OverallQual GarageArea 0.5620218 1.0000000
GarageArea OverallQual 0.5620218 1.0000000
FullBath TotRmsAbvGrd 0.5547843 1.0000000
TotRmsAbvGrd FullBath 0.5547843 1.0000000
OverallQual YearRemodAdd 0.5506839 1.0000000
YearRemodAdd OverallQual 0.5506839 1.0000000
OverallQual FullBath 0.5505997 1.0000000
FullBath OverallQual 0.5505997 1.0000000
OverallQual GarageYrBlt 0.5477658 0.9445205
GarageYrBlt OverallQual 0.5477658 0.9445205
YearBuilt GarageCars 0.5378501 1.0000000
GarageCars YearBuilt 0.5378501 1.0000000
OverallQual TotalBsmtSF 0.5378085 1.0000000
TotalBsmtSF OverallQual 0.5378085 1.0000000
BsmtFinSF1 TotalBsmtSF 0.5223961 1.0000000
TotalBsmtSF BsmtFinSF1 0.5223961 1.0000000
GrLivArea BedroomAbvGr 0.5212695 1.0000000
BedroomAbvGr GrLivArea 0.5212695 1.0000000
X2ndFlrSF BedroomAbvGr 0.5029006 1.0000000
BedroomAbvGr X2ndFlrSF 0.5029006 1.0000000